#plotly scatter plot
airbnb_data %>%
mutate(text_label = str_c("Price: $", price, '\nRating: ', rating, '\nName: ', name)) %>%
plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
alpha = 0.5,
color = ~price,
text = ~text_label)
common_neighborhoods = airbnb_data %>%
count(neighbourhood, sort = TRUE) %>%
top_n(8) %>%
dplyr::select(neighbourhood)
## Selecting by n
#neighborhood-price
inner_join(airbnb_data, common_neighborhoods,
by = "neighbourhood") %>%
mutate(neighbourhood = fct_reorder(neighbourhood, price)) %>%
plot_ly(y = ~price, color = ~neighbourhood, type = "box",
colors = "Set2")
#?
airbnb_data %>%
count(neighbourhood) %>%
mutate(neighbourhood = fct_reorder(neighbourhood, n)) %>%
plot_ly(x = ~neighbourhood, y = ~n, color = ~neighbourhood, type = "bar")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
#corplot
corr<-airbnb_data%>%
dplyr::select(-boro , -neighbourhood, -room_type, -name, -latitude, -longitude)
corrplot(cor(corr), method="square",shade.col=NA, tl.col="black", tl.srt=45)
corrplot(cor(airbnb), method = "square", shade.col = NA)
numeric_data = airbnb_data%>%
dplyr::select(price,everything(),-boro, -neighbourhood, -room_type, -name, -longitude, -latitude)
plot(numeric_data$price)
featurePlot(x=numeric_data[,2:5],y=numeric_data$price, plot = 'pairs')
#splitting data into train and test
sample_size=floor(0.75*nrow(numeric_data))
set.seed(1)
sample_air = sample(seq_len(nrow(numeric_data)), size = sample_size)
train = numeric_data[sample_air, ]
test = numeric_data[-sample_air, ]
#LSE on the traing data
ln_model = lm(price~., data=train)
pred_data = predict(ln_model, test)
test_error = mean((pred_data-test$price)^2)
test_error
## [1] 6779.265
#ridge regression
x_test = model.matrix(price~.,train)[,-1]
y_test = train$price
grid_ridge = 10^seq(10.,-5, length = 1000)
ridge_model = glmnet(x_test,y_test,alpha=0,lambda = grid_ridge)
cv.out = cv.glmnet(x_test,y_test,alpha=0, lambda = grid_ridge,
type.measure = "mse")
plot(cv.out)
best_lambda = cv.out$lambda.min
round(best_lambda,3)
## [1] 1.231
best_ridge_mod = glmnet(x_test,y_test,alpha=0,lambda = best_lambda)
reg_pred=predict(best_ridge_mod ,s=best_lambda,newx=x_test)
test_error2=
mean((reg_pred-y_test)^2);test_error2
## [1] 6535.136
set.seed(2)
grid_lasso = exp(seq(1,-8,length=100))
lasso_mod = glmnet(x_test,y_test,alpha=1,lambda= grid_lasso)
cv.out2 = cv.glmnet(x_test,y_test,alpha=1,lambda= grid_lasso)
best_lambda2 = cv.out2$lambda_min
plot(cv.out2)
pred_lasso = predict(lasso_mod ,s=best_lambda2, newx=x_test)
test_error3 = mean((pred_lasso-y_test)^2);test_error3
## [1] 6537.716
coefficients = predict(lasso_mod, s=best_lambda2, type="coefficients") %>%
as.matrix()
non_zero_coeff = coefficients[coefficients[,1] != 0,]
non_zero_coeff%>% knitr::kable()
| s0 | s1 | s2 | s3 | s4 | s5 | s6 | s7 | s8 | s9 | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | s22 | s23 | s24 | s25 | s26 | s27 | s28 | s29 | s30 | s31 | s32 | s33 | s34 | s35 | s36 | s37 | s38 | s39 | s40 | s41 | s42 | s43 | s44 | s45 | s46 | s47 | s48 | s49 | s50 | s51 | s52 | s53 | s54 | s55 | s56 | s57 | s58 | s59 | s60 | s61 | s62 | s63 | s64 | s65 | s66 | s67 | s68 | s69 | s70 | s71 | s72 | s73 | s74 | s75 | s76 | s77 | s78 | s79 | s80 | s81 | s82 | s83 | s84 | s85 | s86 | s87 | s88 | s89 | s90 | s91 | s92 | s93 | s94 | s95 | s96 | s97 | s98 | s99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | -14.6337302 | -17.5382337 | -20.1904861 | -22.5933756 | -24.7553207 | -26.7290819 | -28.5313247 | -30.1769539 | -31.7221908 | -33.1954397 | -34.5444728 | -35.7762205 | -36.9009294 | -37.927902 | -38.8656313 | -39.7218726 | -40.5037072 | -41.217601 | -41.8694577 | -42.4646687 | -43.0081563 | -43.4999213 | -43.9532638 | -44.3673820 | -44.7455244 | -45.0908072 | -45.4060852 | -45.6939658 | -45.9568297 | -46.1968510 | -46.4104660 | -46.6157557 | -46.7942310 | -46.9614130 | -47.1141433 | -47.2535970 | -47.3809307 | -47.4971988 | -47.603363 | -47.7003018 | -47.7888166 | -47.8696395 | -47.9434390 | -48.0108253 | -48.0723558 | -48.1285394 | -48.179841 | -48.2266839 | -48.2694564 | -48.3085121 | -48.3441738 | -48.3767366 | -48.4064697 | -48.4336190 | -48.458409 | -48.4810449 | -48.5017136 | -48.5205863 | -48.5378190 | -48.5535541 | -48.5679219 | -48.5810411 | -48.5930203 | -48.6039585 | -48.6139462 | -48.6230659 | -48.6313932 | -48.6389968 | -48.6459396 | -48.6522792 | -48.6580678 | -48.6633534 | -48.6681797 | -48.6725866 | -48.6766105 | -48.680285 | -48.6836398 | -48.6867032 | -48.6895004 | -48.6920546 | -48.6943867 | -48.6965163 | -48.6984607 | -48.7002362 | -48.7018574 | -48.7033377 | -48.7046894 | -48.7059237 | -48.707051 | -48.7080797 | -48.7090193 | -48.7098772 | -48.7106606 | -48.7113760 | -48.7120291 | -48.7126256 | -48.7131701 | -48.7136674 | -48.7141214 | -48.714536 |
| rating | 30.8261182 | 31.4158261 | 31.9543195 | 32.4511295 | 32.9139766 | 33.3365343 | 33.7223720 | 34.0746807 | 34.4031525 | 34.7120665 | 34.9951050 | 35.2535366 | 35.4895104 | 35.704978 | 35.9017222 | 36.0813691 | 36.2454049 | 36.395186 | 36.5319514 | 36.6568319 | 36.7708604 | 36.8740607 | 36.9691770 | 37.0560607 | 37.1353963 | 37.2078379 | 37.2739844 | 37.3343827 | 37.3895325 | 37.4398899 | 37.4848583 | 37.5277840 | 37.5653518 | 37.6004570 | 37.6324980 | 37.6617453 | 37.6884485 | 37.7128305 | 37.735093 | 37.7554218 | 37.7739837 | 37.7909325 | 37.8064084 | 37.8205395 | 37.8334426 | 37.8452245 | 37.855983 | 37.8658057 | 37.8747752 | 37.8829653 | 37.8904436 | 37.8972721 | 37.9035073 | 37.9092005 | 37.914399 | 37.9191459 | 37.9234802 | 37.9274378 | 37.9310516 | 37.9343513 | 37.9373642 | 37.9401154 | 37.9426274 | 37.9449212 | 37.9470157 | 37.9489281 | 37.9506743 | 37.9522688 | 37.9537248 | 37.9550542 | 37.9562681 | 37.9573765 | 37.9583886 | 37.9593127 | 37.9601566 | 37.960927 | 37.9616306 | 37.9622730 | 37.9628596 | 37.9633952 | 37.9638843 | 37.9643308 | 37.9647386 | 37.9651109 | 37.9654509 | 37.9657613 | 37.9660448 | 37.9663036 | 37.966540 | 37.9667557 | 37.9669528 | 37.9671327 | 37.9672970 | 37.9674470 | 37.9675839 | 37.9677090 | 37.9678232 | 37.9679275 | 37.9680227 | 37.968110 |
| availability_365 | 0.0232891 | 0.0256337 | 0.0277746 | 0.0298086 | 0.0318048 | 0.0336275 | 0.0352917 | 0.0368114 | 0.0381007 | 0.0390997 | 0.0400153 | 0.0408514 | 0.0416149 | 0.042312 | 0.0429485 | 0.0435297 | 0.0440604 | 0.044545 | 0.0449875 | 0.0453915 | 0.0457605 | 0.0461016 | 0.0464091 | 0.0466896 | 0.0469457 | 0.0471796 | 0.0473931 | 0.0475881 | 0.0477661 | 0.0479287 | 0.0480838 | 0.0482132 | 0.0483421 | 0.0484564 | 0.0485594 | 0.0486532 | 0.0487387 | 0.0488168 | 0.048888 | 0.0489531 | 0.0490125 | 0.0490668 | 0.0491163 | 0.0491616 | 0.0492029 | 0.0492406 | 0.049275 | 0.0493065 | 0.0493352 | 0.0493614 | 0.0493854 | 0.0494072 | 0.0494272 | 0.0494454 | 0.049462 | 0.0494772 | 0.0494911 | 0.0495038 | 0.0495154 | 0.0495259 | 0.0495356 | 0.0495444 | 0.0495524 | 0.0495598 | 0.0495665 | 0.0495726 | 0.0495782 | 0.0495833 | 0.0495879 | 0.0495922 | 0.0495961 | 0.0495996 | 0.0496029 | 0.0496058 | 0.0496085 | 0.049611 | 0.0496132 | 0.0496153 | 0.0496172 | 0.0496189 | 0.0496205 | 0.0496219 | 0.0496232 | 0.0496244 | 0.0496255 | 0.0496265 | 0.0496274 | 0.0496282 | 0.049629 | 0.0496296 | 0.0496303 | 0.0496309 | 0.0496314 | 0.0496319 | 0.0496323 | 0.0496327 | 0.0496331 | 0.0496334 | 0.0496337 | 0.049634 |
| calculated_host_listings_count | -2.6397419 | -2.7643308 | -2.8780926 | -2.9806611 | -3.0720127 | -3.1554270 | -3.2315928 | -3.3011397 | -3.3642121 | -3.4211202 | -3.4730549 | -3.5204772 | -3.5637785 | -3.603317 | -3.6394195 | -3.6723848 | -3.7024855 | -3.729970 | -3.7550669 | -3.7779825 | -3.7989068 | -3.8180578 | -3.8355016 | -3.8514277 | -3.8659696 | -3.8792479 | -3.8913723 | -3.9024431 | -3.9125518 | -3.9217821 | -3.9302497 | -3.9379103 | -3.9449663 | -3.9513840 | -3.9572407 | -3.9625878 | -3.9674700 | -3.9719280 | -3.975999 | -3.9797154 | -3.9831092 | -3.9862081 | -3.9890378 | -3.9916215 | -3.9939807 | -3.9961349 | -3.998102 | -3.9998979 | -4.0015379 | -4.0030354 | -4.0044027 | -4.0056513 | -4.0067913 | -4.0078323 | -4.008783 | -4.0096507 | -4.0104431 | -4.0111668 | -4.0118275 | -4.0124308 | -4.0129817 | -4.0134847 | -4.0139440 | -4.0143634 | -4.0147464 | -4.0150960 | -4.0154153 | -4.0157069 | -4.0159731 | -4.0162161 | -4.0164381 | -4.0166407 | -4.0168258 | -4.0169948 | -4.0171490 | -4.017290 | -4.0174186 | -4.0175360 | -4.0176433 | -4.0177412 | -4.0178306 | -4.0179123 | -4.0179868 | -4.0180549 | -4.0181171 | -4.0181738 | -4.0182256 | -4.0182730 | -4.018316 | -4.0183556 | -4.0183917 | -4.0184246 | -4.0184546 | -4.0184820 | -4.0185071 | -4.0185299 | -4.0185508 | -4.0185699 | -4.0185873 | -4.018603 |
set.seed(46)
train_price_tree = sample(1:nrow(airbnb), nrow(airbnb)/2)
# grow tree
fit_price = rpart(price ~ ., data = airbnb, subset = train_price_tree)
printcp(fit_price) # display the results
##
## Regression tree:
## rpart(formula = price ~ ., data = airbnb, subset = train_price_tree)
##
## Variables actually used in tree construction:
## [1] availability_365 id neighbourhood room_type
##
## Root node error: 721426951/20376 = 35406
##
## n= 20376
##
## CP nsplit rel error xerror xstd
## 1 0.101805 0 1.00000 1.00007 0.16775
## 2 0.021830 1 0.89819 0.89845 0.16824
## 3 0.010812 2 0.87636 0.89125 0.16662
## 4 0.010460 3 0.86555 0.90215 0.16701
## 5 0.010000 4 0.85509 0.90196 0.16704
plotcp(fit_price) # visualize cross-validation results
summary(fit_price) # detailed summary of splits
## Call:
## rpart(formula = price ~ ., data = airbnb, subset = train_price_tree)
## n= 20376
##
## CP nsplit rel error xerror xstd
## 1 0.10180536 0 1.0000000 1.0000674 0.1677542
## 2 0.02183025 1 0.8981946 0.8984541 0.1682425
## 3 0.01081207 2 0.8763644 0.8912506 0.1666185
## 4 0.01045992 3 0.8655523 0.9021475 0.1670055
## 5 0.01000000 4 0.8550924 0.9019622 0.1670371
##
## Variable importance
## room_type availability_365
## 46 10
## calculated_host_listings_count minimum_nights
## 9 8
## id review_scores_location
## 7 6
## number_of_reviews neighbourhood
## 5 5
## neighbourhood_group reviews_per_month
## 3 1
##
## Node number 1: 20376 observations, complexity param=0.1018054
## mean=145.2801, MSE=35405.72
## left son=2 (10398 obs) right son=3 (9978 obs)
## Primary splits:
## room_type < 1.5 to the right, improve=0.101805400, (0 missing)
## neighbourhood_group < 2.5 to the left, improve=0.015634360, (0 missing)
## neighbourhood < 29.5 to the left, improve=0.009643181, (0 missing)
## availability_365 < 364.5 to the left, improve=0.008202421, (0 missing)
## calculated_host_listings_count < 1.5 to the right, improve=0.005307350, (0 missing)
## Surrogate splits:
## calculated_host_listings_count < 1.5 to the right, agree=0.601, adj=0.186, (0 split)
## minimum_nights < 1.5 to the left, agree=0.591, adj=0.164, (0 split)
## review_scores_location < 9.5 to the left, agree=0.560, adj=0.101, (0 split)
## number_of_reviews < 4.5 to the left, agree=0.549, adj=0.078, (0 split)
## neighbourhood_group < 2.5 to the left, agree=0.543, adj=0.067, (0 split)
##
## Node number 2: 10398 observations
## mean=86.46769, MSE=21407.37
##
## Node number 3: 9978 observations, complexity param=0.02183025
## mean=206.568, MSE=42632.59
## left son=6 (9563 obs) right son=7 (415 obs)
## Primary splits:
## availability_365 < 360.5 to the left, improve=0.037022510, (0 missing)
## neighbourhood_group < 2.5 to the left, improve=0.019142500, (0 missing)
## neighbourhood < 29.5 to the left, improve=0.010184230, (0 missing)
## calculated_host_listings_count < 4.5 to the left, improve=0.009664178, (0 missing)
## reviews_per_month < 0.035 to the right, improve=0.009038157, (0 missing)
## Surrogate splits:
## calculated_host_listings_count < 11.5 to the left, agree=0.961, adj=0.058, (0 split)
##
## Node number 6: 9563 observations
## mean=198.2919, MSE=28469.96
##
## Node number 7: 415 observations, complexity param=0.01081207
## mean=397.2795, MSE=331038.2
## left son=14 (395 obs) right son=15 (20 obs)
## Primary splits:
## id < 16515870 to the left, improve=0.05677735, (0 missing)
## calculated_host_listings_count < 4.5 to the left, improve=0.03346697, (0 missing)
## minimum_nights < 1.5 to the right, improve=0.02794635, (0 missing)
## neighbourhood_group < 2.5 to the left, improve=0.02410042, (0 missing)
## reviews_per_month < 0.135 to the right, improve=0.02355728, (0 missing)
##
## Node number 14: 395 observations
## mean=366.4304, MSE=192384.9
##
## Node number 15: 20 observations, complexity param=0.01045992
## mean=1006.55, MSE=2679435
## left son=30 (8 obs) right son=31 (12 obs)
## Primary splits:
## neighbourhood < 93.5 to the left, improve=0.14081460, (0 missing)
## availability_365 < 363.5 to the left, improve=0.04684115, (0 missing)
## id < 16875470 to the right, improve=0.03264132, (0 missing)
## minimum_nights < 2.5 to the left, improve=0.01088263, (0 missing)
## Surrogate splits:
## id < 17254840 to the right, agree=0.80, adj=0.500, (0 split)
## review_scores_location < 4 to the right, agree=0.70, adj=0.250, (0 split)
## reviews_per_month < 0.5 to the right, agree=0.70, adj=0.250, (0 split)
## number_of_reviews < 0.5 to the right, agree=0.70, adj=0.250, (0 split)
## availability_365 < 362.5 to the left, agree=0.65, adj=0.125, (0 split)
##
## Node number 30: 8 observations
## mean=254.25, MSE=24762.94
##
## Node number 31: 12 observations
## mean=1508.083, MSE=3820377
plot(as.party(fit_price))
pred_fit_price = predict(fit_price, newdata = airbnb[-train_price_tree,])
# no pruning
ctree_price = ctree(price~., airbnb, subset = train_price_tree)
#summary(ctree.boston)
#plot(ctree_price) useless
pred_ctree_price = predict(ctree_price, newdata = airbnb[-train_price_tree,])
# tune over maximum depth, method = "rpart2" (plots Max Tree depth)
# tune over cp, method = "rpart" (plots Complexity Parameter)
rpartTune = train(airbnb[,-11], airbnb$price,
method = "rpart2",
trControl = trainControl(method = "cv", number =10))
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning: Setting row names on a tibble is deprecated.
plot(rpartTune)
# create additional plots
# two plots on one page par(mfrow=c(1,2))
rsq.rpart(fit_price) # visualize cross-validation results
##
## Regression tree:
## rpart(formula = price ~ ., data = airbnb, subset = train_price_tree)
##
## Variables actually used in tree construction:
## [1] availability_365 id neighbourhood room_type
##
## Root node error: 721426951/20376 = 35406
##
## n= 20376
##
## CP nsplit rel error xerror xstd
## 1 0.101805 0 1.00000 1.00007 0.16775
## 2 0.021830 1 0.89819 0.89845 0.16824
## 3 0.010812 2 0.87636 0.89125 0.16662
## 4 0.010460 3 0.86555 0.90215 0.16701
## 5 0.010000 4 0.85509 0.90196 0.16704
# plot tree
plot(fit_price, uniform=TRUE, main="Regression Tree for Price ")
text(fit_price, use.n=TRUE, all=TRUE, cex=.8)
# prune the tree
pfit_price = prune(fit_price, cp=0.010460) # from cptable
# plot the pruned tree
plot(pfit_price, uniform=TRUE, main="Pruned Regression Tree for Price")
text(pfit_price, use.n=TRUE, all=TRUE, cex=.8)
set.seed(4)
train_lq = sample(1:dim(airbnb)[1], 15000, replace = FALSE)
ctrl = trainControl(method = "cv",
summaryFunction = twoClassSummary,
classProbs = TRUE)
lda.fit = lda(neighbourhood_group ~ ., data = airbnb, subset = train_lq)
lda.pred = predict(lda.fit, newdata = airbnb[-train_lq,])
#roc.lda = roc(airbnb$neighbourhood_group[-train_lq], lda.pred$posterior[,2])
#plot(roc.lda, legacy.axes = TRUE)
#ldafit1 = train(x = airbnb[,-1],
# y = airbnb$neighbourhood_group,
# method = "lda",
# preProc = c("center","scale"),
# metric = "ROC",
# trControl = ctrl)
#qda.fit = qda(price ~ Lag1+Lag2, data = airbnb ,subset = train_ql)
airbnb_tree_data = dplyr::select(airbnb_data, -neighbourhood, -reviews_per_month,- name, -latitude, -longitude)
set.seed(123)
n = nrow(airbnb_tree_data)
trainIndex = sample(1:n, size = round(0.5*n), replace=FALSE)
airbnb_tree_train = airbnb_tree_data[trainIndex ,]
airbnb_tree_test = airbnb_tree_data[-trainIndex ,]
#pruned tree
tree.airbnb <- tree(boro ~ ., data = airbnb_tree_train)
summary(tree.airbnb)
##
## Classification tree:
## tree(formula = boro ~ ., data = airbnb_tree_train)
## Variables actually used in tree construction:
## [1] "price"
## Number of terminal nodes: 3
## Residual mean deviance: 1.956 = 29370 / 15020
## Misclassification error rate: 0.4504 = 6765 / 15020
cv.tree.airbnb <- cv.tree(tree.airbnb, FUN = prune.misclass)
minsize=cv.tree.airbnb$size[which.min(cv.tree.airbnb$dev)]
prune.tree.airbnb <- prune.misclass(tree.airbnb, best = minsize)
summary(prune.tree.airbnb)
##
## Classification tree:
## tree(formula = boro ~ ., data = airbnb_tree_train)
## Variables actually used in tree construction:
## [1] "price"
## Number of terminal nodes: 3
## Residual mean deviance: 1.956 = 29370 / 15020
## Misclassification error rate: 0.4504 = 6765 / 15020
#pruned tree plot
plot(prune.tree.airbnb)
text(prune.tree.airbnb, pretty = 0)
#prediction
predict.pruned.tree <- predict(prune.tree.airbnb, airbnb_tree_test, type='class')
table(predict.pruned.tree, airbnb_tree_test$boro)
##
## predict.pruned.tree Bronx Brooklyn Manhattan Queens Staten Island
## Bronx 0 0 0 0 0
## Brooklyn 160 2872 1649 869 60
## Manhattan 64 3354 5415 545 32
## Queens 0 0 0 0 0
## Staten Island 0 0 0 0 0
basic.mse=mean(predict.pruned.tree != airbnb_tree_test$boro)
the optimal tree size equals to 3, the training data error rate is 0.4531. Use test dataset to predict, the error rate is 0.4448735.
bag.airbnb <- randomForest(boro ~ ., data = airbnb_tree_train, mtry = 10, ntree = 500, importance = TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within
## valid range
pred.bag.airbnb <- predict(bag.airbnb, newdata = airbnb_tree_test)
table(pred.bag.airbnb, airbnb_tree_test$boro )
##
## pred.bag.airbnb Bronx Brooklyn Manhattan Queens Staten Island
## Bronx 7 10 8 12 0
## Brooklyn 150 3434 2256 770 56
## Manhattan 40 2516 4662 419 22
## Queens 27 259 135 211 13
## Staten Island 0 7 3 2 1
bag.mse=mean(pred.bag.airbnb!= airbnb_tree_test$boro )
#0.4466045 test error.
varImpPlot(bag.airbnb)
#price and availability is important
rf.airbnb <- randomForest(boro ~ ., data = airbnb_tree_train, mtry = 5, ntree = 500, importance = TRUE)
pred.rf.airbnb <- predict(rf.airbnb, newdata = airbnb_tree_test)
table(pred.rf.airbnb, airbnb_tree_test$boro )
##
## pred.rf.airbnb Bronx Brooklyn Manhattan Queens Staten Island
## Bronx 5 8 10 11 0
## Brooklyn 154 3441 2217 784 62
## Manhattan 40 2530 4709 403 20
## Queens 25 239 124 214 10
## Staten Island 0 8 4 2 0
rf.mse=mean(pred.rf.airbnb!= airbnb_tree_test$boro )
#0.4442743 test error
varImpPlot(rf.airbnb)
##price and availability is important
airbnb_tree_train$boro <- as.numeric(airbnb_tree_train$boro == "Manhattan")
airbnb_tree_test$boro <- as.numeric(airbnb_tree_test$boro == "Manhattan")
boost.airbnb = gbm(boro ~ ., data = airbnb_tree_train, distribution = "bernoulli", n.trees = 5000, interaction.depth = 4)
yhat.boost = predict(boost.airbnb, newdata = airbnb_tree_test,
n.trees = 5000, type = "response")
pred.boost.airbnb <- ifelse(yhat.boost > 0.5, 1, 0)
table(pred.boost.airbnb, airbnb_tree_test$boro)
##
## pred.boost.airbnb 0 1
## 0 5745 2517
## 1 2211 4547
boost.mse = (2187 + 2596)/(2187 + 4483 + 2596 + 5754)
#(2187 + 2596)/(2187 + 4483 + 2596 + 5754) = 0.318442 test error
summary(boost.airbnb)
## var rel.inf
## price price 63.492283
## room_type room_type 14.741522
## rating rating 8.596436
## availability_365 availability_365 7.383736
## calculated_host_listings_count calculated_host_listings_count 3.586075
## number_of_reviews number_of_reviews 2.199948
#price is important
compare_df = data.frame(Boosting_MSE = boost.mse, Random_forest_MSE =rf.mse, Bagging_MSE = bag.mse, Decision_trees_MSE = basic.mse)
compare_df
## Boosting_MSE Random_forest_MSE Bagging_MSE Decision_trees_MSE
## 1 0.3184421 0.4428096 0.4464048 0.448269